/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "pipe/p_screen.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/format/u_format.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"

#include "nir/tgsi_to_nir.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_shader.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_nir.h"

/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but also where the
 * async compile of initial variants is plumbed in (via the ready fence).
 */
struct ir3_shader_state {
	struct ir3_shader *shader;

	/* Fence signalled when async compile is completed: */
	struct util_queue_fence ready;
};

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where pipe_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.  FD_MESA_DEBUG=serialc
 * additionally forces initial variants to compile synchronously.
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
	return unlikely(ctx->debug.debug_message) ||
			FD_DBG(SHADERDB) || FD_DBG(SERIALC);
}

static void
dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug)
{
	if (!FD_DBG(SHADERDB))
		return;

	pipe_debug_message(debug, SHADER_INFO,
			"%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
			"%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
			"%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
			"%u sstall, %u (ss), %u (sy), %d waves, %d max_sun, %d loops\n",
			ir3_shader_stage(v),
			v->info.instrs_count,
			v->info.nops_count,
			v->info.instrs_count - v->info.nops_count,
			v->info.mov_count,
			v->info.cov_count,
			v->info.sizedwords,
			v->info.last_baryf,
			v->info.max_half_reg + 1,
			v->info.max_reg + 1,
			v->constlen,
			v->info.instrs_per_cat[0],
			v->info.instrs_per_cat[1],
			v->info.instrs_per_cat[2],
			v->info.instrs_per_cat[3],
			v->info.instrs_per_cat[4],
			v->info.instrs_per_cat[5],
			v->info.instrs_per_cat[6],
			v->info.instrs_per_cat[7],
			v->info.sstall,
			v->info.ss, v->info.sy,
			v->info.max_waves,
			v->max_sun, v->loops);
}

static void
upload_shader_variant(struct ir3_shader_variant *v)
{
	struct shader_info *info = &v->shader->nir->info;
	struct ir3_compiler *compiler = v->shader->compiler;

	assert(!v->bo);

	v->bo = fd_bo_new(compiler->dev, v->info.size,
			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
			DRM_FREEDRENO_GEM_TYPE_KMEM,
			"%s:%s", ir3_shader_stage(v), info->name);

	/* Always include shaders in kernel crash dumps. */
	fd_bo_mark_for_dump(v->bo);

	memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
		bool binning_pass, struct pipe_debug_callback *debug)
{
	struct ir3_shader_variant *v;
	bool created = false;

	/* Some shader key values may not be used by a given ir3_shader (for
	 * example, the fragment-shader saturate bits in a vertex shader's
	 * key), so clear out those flags to avoid spurious recompiles.
	 */
	ir3_key_clear_unused(&key, shader);

	v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

	if (created) {
		if (shader->initial_variants_done) {
			pipe_debug_message(debug, SHADER_INFO,
					"%s shader: recompiling at draw time: global 0x%08x, vfsamples %x/%x, astc %x/%x\n",
					ir3_shader_stage(v),
					key.global,
					key.vsamples, key.fsamples,
					key.vastc_srgb, key.fastc_srgb);
		}

		dump_shader_info(v, debug);
		upload_shader_variant(v);

		if (v->binning) {
			dump_shader_info(v->binning, debug);
			upload_shader_variant(v->binning);
		}
	}

	return v;
}

static void
copy_stream_out(struct ir3_stream_output_info *i,
		const struct pipe_stream_output_info *p)
{
	STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
	STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

	i->num_outputs = p->num_outputs;
	for (int n = 0; n < ARRAY_SIZE(i->stride); n++)
		i->stride[n] = p->stride[n];

	for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
		i->output[n].register_index  = p->output[n].register_index;
		i->output[n].start_component = p->output[n].start_component;
		i->output[n].num_components  = p->output[n].num_components;
		i->output[n].output_buffer   = p->output[n].output_buffer;
		i->output[n].dst_offset      = p->output[n].dst_offset;
		i->output[n].stream          = p->output[n].stream;
	}
}

static void
create_initial_variants(struct ir3_shader_state *hwcso,
		struct pipe_debug_callback *debug)
{
	struct ir3_shader *shader = hwcso->shader;
	struct ir3_compiler *compiler = shader->compiler;
	nir_shader *nir = shader->nir;

	/* Compile standard variants immediately to try to avoid draw-time stalls
	 * to run the compiler.
	 */
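	/* Note: ucp_enables gets one lowering bit per declared clip distance;
	 * MASK(n) is the low n-bit mask, ie. (1 << n) - 1.
	 */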
	struct ir3_shader_key key = {
		.tessellation = IR3_TESS_NONE,
		.ucp_enables = MASK(nir->info.clip_distance_array_size),
		.msaa = true,
	};

	switch (nir->info.stage) {
	case MESA_SHADER_TESS_EVAL:
		key.tessellation = ir3_tess_mode(nir->info.tess.primitive_mode);
		break;

	case MESA_SHADER_TESS_CTRL:
		/* The primitive_mode field, while it exists for TCS, is not
		 * populated (since separable shaders between TCS/TES are legal,
		 * so TCS wouldn't have access to TES's declaration).  Make a
		 * guess so that shader-db reports something plausible for TCS.
		 */
		if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
			key.tessellation = IR3_TESS_TRIANGLES;
		else
			key.tessellation = IR3_TESS_ISOLINES;
		break;

	case MESA_SHADER_GEOMETRY:
		key.has_gs = true;
		break;

	default:
		break;
	}

	key.safe_constlen = false;
	struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
	if (!v)
		return;

	if (v->constlen > compiler->max_const_safe) {
		key.safe_constlen = true;
		ir3_shader_variant(shader, key, false, debug);
	}

	/* For vertex shaders, also compile initial binning pass shader: */
	if (nir->info.stage == MESA_SHADER_VERTEX) {
		key.safe_constlen = false;
		v = ir3_shader_variant(shader, key, true, debug);
		if (!v)
			return;

		if (v->constlen > compiler->max_const_safe) {
			key.safe_constlen = true;
			ir3_shader_variant(shader, key, true, debug);
		}
	}

	shader->initial_variants_done = true;
}

static void
create_initial_variants_async(void *job, int thread_index)
{
	struct ir3_shader_state *hwcso = job;
	struct pipe_debug_callback debug = {};

	create_initial_variants(hwcso, &debug);
}

static void
create_initial_compute_variants_async(void *job, int thread_index)
{
	struct ir3_shader_state *hwcso = job;
	struct ir3_shader *shader = hwcso->shader;
	struct pipe_debug_callback debug = {};
	static struct ir3_shader_key key; /* static is implicitly zeroed */

	ir3_shader_variant(shader, key, false, &debug);
	shader->initial_variants_done = true;
}

/* A bit annoying that compute-shader and normal shader state objects
 * aren't more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
		const struct pipe_compute_state *cso)
{
	struct fd_context *ctx = fd_context(pctx);

	/* req_input_mem will only be non-zero for cl kernels (ie. clover).
	 * This isn't a perfect test because I guess it is possible (but
	 * uncommon) for none of the kernel parameters to be a global,
	 * but ctx->set_global_bindings() can't fail, so this is the next
	 * best place to fail if we need a newer version of the kernel
	 * driver:
	 */
	if ((cso->req_input_mem > 0) &&
			fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
		return NULL;
	}

	struct ir3_compiler *compiler = ctx->screen->compiler;
	nir_shader *nir;

	if (cso->ir_type == PIPE_SHADER_IR_NIR) {
		/* we take ownership of the reference: */
		nir = (nir_shader *)cso->prog;
	} else {
		debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
		if (ir3_shader_debug & IR3_DBG_DISASM) {
			tgsi_dump(cso->prog, 0);
		}
		nir = tgsi_to_nir(cso->prog, pctx->screen, false);
	}

	struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL);
	struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

	util_queue_fence_init(&hwcso->ready);
	hwcso->shader = shader;

	/* Immediately compile a standard variant.  We have so few variants in
	 * our shaders that doing so almost eliminates draw-time recompiles.
	 * (This is also how we get data from shader-db's ./run.)
	 */

	if (initial_variants_synchronous(ctx)) {
		static struct ir3_shader_key key; /* static is implicitly zeroed */
		ir3_shader_variant(shader, key, false, &ctx->debug);
		shader->initial_variants_done = true;
	} else {
		struct fd_screen *screen = ctx->screen;
		util_queue_add_job(&screen->compile_queue, hwcso,
				&hwcso->ready, create_initial_compute_variants_async,
				NULL, 0);
	}

	return hwcso;
}

void *
ir3_shader_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso)
{
	struct fd_context *ctx = fd_context(pctx);
	struct ir3_compiler *compiler = ctx->screen->compiler;
	struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

	/*
	 * Convert to nir (if necessary):
	 */

	nir_shader *nir;
	if (cso->type == PIPE_SHADER_IR_NIR) {
		/* we take ownership of the reference: */
		nir = cso->ir.nir;
	} else {
		debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
		if (ir3_shader_debug & IR3_DBG_DISASM) {
			tgsi_dump(cso->tokens, 0);
		}
		nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
	}

	/*
	 * Create ir3_shader:
	 *
	 * This part is cheap; it doesn't compile the initial variants.
	 */

	struct ir3_stream_output_info stream_output = {};
	copy_stream_out(&stream_output, &cso->stream_output);

	hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output);

	/*
	 * Create initial variants to avoid draw-time stalls.  This is
	 * normally done asynchronously, unless debug is enabled (which
	 * will be the case for shader-db).
	 */

	util_queue_fence_init(&hwcso->ready);

	if (initial_variants_synchronous(ctx)) {
		create_initial_variants(hwcso, &ctx->debug);
	} else {
		util_queue_add_job(&ctx->screen->compile_queue, hwcso,
				&hwcso->ready, create_initial_variants_async,
				NULL, 0);
	}

	return hwcso;
}

void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
	struct fd_context *ctx = fd_context(pctx);
	struct fd_screen *screen = ctx->screen;
	struct ir3_shader_state *hwcso = _hwcso;
	struct ir3_shader *so = hwcso->shader;

	ir3_cache_invalidate(ctx->shader_cache, hwcso);

	/* util_queue_drop_job() guarantees that either:
	 *  1) job did not execute
	 *  2) job completed
	 *
	 * In either case the fence is signaled
	 */
	util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

	/* free the uploaded shaders, since this is handled outside of the
	 * shared ir3 code (ie. not used by turnip):
	 */
	for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
		fd_bo_del(v->bo);
		v->bo = NULL;

		if (v->binning && v->binning->bo) {
			fd_bo_del(v->binning->bo);
			v->binning->bo = NULL;
		}
	}

	ir3_shader_destroy(so);
	util_queue_fence_destroy(&hwcso->ready);
	free(hwcso);
}

struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
	if (!hwcso)
		return NULL;

	struct ir3_shader *shader = hwcso->shader;
	perf_time(1000, "waited for %s:%s:%s variants",
			_mesa_shader_stage_to_abbrev(shader->type),
			shader->nir->info.name, shader->nir->info.label) {
		/* wait for initial variants to compile: */
		util_queue_fence_wait(&hwcso->ready);
	}

	return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
	if (!hwcso)
		return NULL;
	return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
	struct fd_context *ctx = fd_context(pctx);

	if (!ir3_shader_key_equal(ctx->last.key, key)) {
		if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
			fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT, FD_DIRTY_SHADER_PROG);
		}

		if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
			fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
		}

		/* NOTE: currently only a6xx has gs/tess, but needs no
		 * gs/tess specific lowering.
		 */

		*ctx->last.key = *key;
	}
}

static void
ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir, bool optimize)
{
	struct fd_screen *screen = fd_screen(pscreen);

	ir3_finalize_nir(screen->compiler, nir);
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen, unsigned max_threads)
{
	struct fd_screen *screen = fd_screen(pscreen);

	/* This function doesn't allow a greater number of threads than
	 * the queue had at its creation.
	 */
	util_queue_adjust_num_threads(&screen->compile_queue, max_threads);
}

static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
		void *shader, enum pipe_shader_type shader_type)
{
	struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

	return util_queue_fence_is_signalled(&hwcso->ready);
}

void
ir3_prog_init(struct pipe_context *pctx)
{
	pctx->create_vs_state = ir3_shader_state_create;
	pctx->delete_vs_state = ir3_shader_state_delete;

	pctx->create_tcs_state = ir3_shader_state_create;
	pctx->delete_tcs_state = ir3_shader_state_delete;

	pctx->create_tes_state = ir3_shader_state_create;
	pctx->delete_tes_state = ir3_shader_state_delete;

	pctx->create_gs_state = ir3_shader_state_create;
	pctx->delete_gs_state = ir3_shader_state_delete;

	pctx->create_fs_state = ir3_shader_state_create;
	pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
	struct fd_screen *screen = fd_screen(pscreen);

	screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id);

	/* TODO do we want to limit things to the # of fast cores, or just
	 * limit based on the total # of both big and little cores?  The
	 * little cores tend to be in-order and probably much slower for
	 * compiling than big cores.  OTOH if they are sitting idle, maybe
	 * it is useful to use them?
	 */
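	/* Leave one core free for the rest of the process: */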
	unsigned num_threads = MAX2(1, sysconf(_SC_NPROCESSORS_ONLN) - 1);

	util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
			UTIL_QUEUE_INIT_RESIZE_IF_FULL |
			UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY);

	pscreen->finalize_nir = ir3_screen_finalize_nir;
	pscreen->set_max_shader_compiler_threads =
			ir3_set_max_shader_compiler_threads;
	pscreen->is_parallel_shader_compilation_finished =
			ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
	struct fd_screen *screen = fd_screen(pscreen);

	util_queue_destroy(&screen->compile_queue);
	ir3_compiler_destroy(screen->compiler);
	screen->compiler = NULL;
}

void
ir3_update_max_tf_vtx(struct fd_context *ctx, const struct ir3_shader_variant *v)
{
	struct fd_streamout_stateobj *so = &ctx->streamout;
	struct ir3_stream_output_info *info = &v->shader->stream_output;
	uint32_t maxvtxcnt = 0x7fffffff;

	/* Return early, so the computed bound below doesn't overwrite the
	 * zero:
	 */
	if (v->shader->stream_output.num_outputs == 0 ||
			so->num_targets == 0) {
		ctx->streamout.max_tf_vtx = 0;
		return;
	}

	/* offset to write to is:
	 *
	 *   total_vtxcnt = vtxcnt + offsets[i]
	 *   offset = total_vtxcnt * stride[i]
	 *
	 *   offset =   vtxcnt * stride[i]       ; calculated in shader
	 *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
	 *
	 * assuming for each vtx, each target buffer will have data written
	 * up to 'offset + stride[i]', that leaves maxvtxcnt as:
	 *
	 *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
	 *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
	 *
	 * but shader is actually doing a less-than (rather than less-than-
	 * equal) check, so we can drop the -stride[i].
	 *
	 * TODO is assumption about `offset + stride[i]` legit?
	 */
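	/* Worked example (illustrative numbers): a bound target with
	 * buffer_size = 1024 bytes and stride[i] = 4 dwords (16 bytes)
	 * gives max = 1024 / 16 = 64, ie. at most 64 vertices can be
	 * captured into that buffer.
	 */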
	for (unsigned i = 0; i < so->num_targets; i++) {
		struct pipe_stream_output_target *target = so->targets[i];
		unsigned stride = info->stride[i] * 4;   /* convert dwords->bytes */
		/* Skip targets with zero stride (nothing is written to that
		 * buffer), which would otherwise divide by zero:
		 */
		if (target && stride) {
			uint32_t max = target->buffer_size / stride;
			maxvtxcnt = MIN2(maxvtxcnt, max);
		}
	}

	ctx->streamout.max_tf_vtx = maxvtxcnt;
}
